{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "##################################################################################\n",
    "# 分为三个部分\n",
    "# 1. tokenizer部分\n",
    "# 2. transformer部分\n",
    "# 3. pooling部分\n",
    "\n",
    "\n",
    "# from multiprocessing.pool import Pool\n",
    "import numpy as np\n",
    "import onnxruntime\n",
    "import psutil\n",
    "from sympy import im\n",
    "from transformers import (AutoConfig, AutoModel, AutoTokenizer)\n",
    "import os\n",
    "import json\n",
    "from sentence_transformers.models import Pooling\n",
    "\n",
    "from sentence_transformers import SentenceTransformer as sbert\n",
    "\n",
    "from tqdm import tqdm\n",
    "import torch as t \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "##################################################################################\n",
    "# Transformer and tokenizer part of the sentence-transformers checkpoint\n",
    "\n",
    "big_model_path = \"../models/paraphrase-multilingual-MiniLM-L12-v2\"\n",
    "\n",
    "# modules.json is parsed into a list; entry 0 supplies the sub-directory used for the\n",
    "# transformer/tokenizer, entry 1 is used for the pooling module in a later cell.\n",
    "modules_json_path = os.path.join(big_model_path, 'modules.json')\n",
    "# Explicit encoding so parsing does not depend on the machine's locale.\n",
    "with open(modules_json_path, encoding='utf-8') as fIn:\n",
    "    modules_config = json.load(fIn)\n",
    "\n",
    "tf_from_s_path = os.path.join(big_model_path, modules_config[0].get('path'))\n",
    "\n",
    "\n",
    "# Basic parameters\n",
    "# NOTE(review): the constants below are not referenced by the other cells of this notebook.\n",
    "\n",
    "max_seq_length = 128\n",
    "doc_stride = 128\n",
    "max_query_length = 64\n",
    "# Enable overwrite to export onnx model and download latest script each time when running this notebook.\n",
    "enable_overwrite = True\n",
    "# Total samples to inference. It shall be large enough to get stable latency measurement.\n",
    "total_samples = 1000\n",
    "\n",
    "\n",
    "# Load pretrained model and tokenizer\n",
    "config_class, model_class, tokenizer_class = (\n",
    "    AutoConfig, AutoModel, AutoTokenizer)\n",
    "\n",
    "cache_dir = os.path.join(\".\", \"cache_models\")\n",
    "config = config_class.from_pretrained(tf_from_s_path, cache_dir=cache_dir)\n",
    "tokenizer = tokenizer_class.from_pretrained(\n",
    "    tf_from_s_path, do_lower_case=True, cache_dir=cache_dir)\n",
    "model_transformer = model_class.from_pretrained(\n",
    "    tf_from_s_path, from_tf=False, config=config, cache_dir=cache_dir)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-02-25 14:01:58.432835797 [W:onnxruntime:, inference_session.cc:1407 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.\n"
     ]
    }
   ],
   "source": [
    "\n",
    "##################################################################################\n",
    "# Inference with ONNX Runtime on CUDA\n",
    "\n",
    "device_name = 'gpu'\n",
    "output_dir = os.path.join(\"..\", \"onnx_models\")\n",
    "export_model_path = os.path.join(output_dir, 'Multilingual_MiniLM_L12.onnx')\n",
    "\n",
    "sess_options = onnxruntime.SessionOptions()\n",
    "# The optimized graph is serialized to this path when the session is created\n",
    "# (this is what triggers the warning shown in the cell output).\n",
    "sess_options.optimized_model_filepath = os.path.join(\n",
    "    output_dir, \"optimized_model_{}.onnx\".format(device_name))\n",
    "# Please change the value according to best setting in Performance Test Tool result.\n",
    "sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)\n",
    "\n",
    "session = onnxruntime.InferenceSession(\n",
    "    export_model_path,\n",
    "    sess_options=sess_options,\n",
    "    providers=['CUDAExecutionProvider'],\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "##################################################################################\n",
    "# Pooling part: loaded from the sub-directory named by the second modules.json entry\n",
    "pooling_model_path = os.path.join(\n",
    "    big_model_path,\n",
    "    modules_config[1].get('path'),\n",
    ")\n",
    "pooling_model = Pooling.load(pooling_model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2000/2000 [00:05<00:00, 340.29it/s]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "##################################################################################\n",
    "# 推理函数\n",
    "\n",
    "\n",
    "def inferpart(session, sentences, pooling_model, max_length=512):\n",
    "    \"\"\"Embed sentences with the ONNX transformer followed by the pooling module.\n",
    "\n",
    "    Tokenization uses the notebook-level `tokenizer` created in an earlier cell.\n",
    "\n",
    "    Args:\n",
    "        session: ONNX Runtime session; it is called as `session.run(None, feeds)` and\n",
    "            its first output is used as the token embeddings.\n",
    "        sentences: a single string or a sized collection of strings. A string (or any\n",
    "            object without `__len__`) is wrapped into a one-element list.\n",
    "        pooling_model: object whose `forward(features=...)` returns a mapping that\n",
    "            contains 'sentence_embedding'.\n",
    "        max_length: truncation length in tokens handed to the tokenizer.\n",
    "\n",
    "    Returns:\n",
    "        The 'sentence_embedding' entry produced by `pooling_model`.\n",
    "    \"\"\"\n",
    "    if isinstance(sentences, str) or not hasattr(sentences, '__len__'):\n",
    "        sentences = [sentences]\n",
    "\n",
    "    encoded = tokenizer(\n",
    "        sentences,\n",
    "        padding=True,\n",
    "        truncation=True,\n",
    "        max_length=max_length,\n",
    "        return_tensors=\"pt\",\n",
    "    )\n",
    "\n",
    "    # Feed only the inputs the exported graph declares: ONNX Runtime rejects unknown\n",
    "    # feed names, and a tokenizer may emit tensors (e.g. token_type_ids) that were\n",
    "    # not part of the export.\n",
    "    graph_inputs = {node.name for node in session.get_inputs()}\n",
    "    ort_inputs = {\n",
    "        name: tensor.cpu().numpy()\n",
    "        for name, tensor in encoded.items()\n",
    "        if name in graph_inputs\n",
    "    }\n",
    "    ort_outputs = session.run(None, ort_inputs)\n",
    "\n",
    "    # as_tensor replaces the legacy t.Tensor(...) constructor; dtype is pinned to\n",
    "    # float32, which is what that constructor produced.\n",
    "    features = {\n",
    "        'token_embeddings': t.as_tensor(ort_outputs[0], dtype=t.float32),\n",
    "        'attention_mask': encoded.get('attention_mask'),\n",
    "    }\n",
    "    return pooling_model.forward(features=features).get('sentence_embedding')\n",
    "\n",
    "\n",
    "# Time 2000 single-sentence calls through the ONNX + pooling path.\n",
    "# A plain loop is used so the 2000 embeddings are not all kept alive in a list.\n",
    "for _ in tqdm(range(2000)):\n",
    "    inferpart(session=session, sentences=['您好'], pooling_model=pooling_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2000/2000 [00:25<00:00, 77.36it/s]\n"
     ]
    }
   ],
   "source": [
    "# Baseline: the stock sentence-transformers pipeline on the GPU\n",
    "model_sbert_raw = sbert(big_model_path, device='cuda')\n",
    "\n",
    "# Time 2000 single-sentence encode calls; results are discarded instead of being\n",
    "# collected into a list.\n",
    "for _ in tqdm(range(2000)):\n",
    "    model_sbert_raw.encode(['您好'], device='cuda')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "bdbb7989f40b9f457e507bcd5a50987663e2e8c6eeb0c3d4331f0628f0ad53bb"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 ('mynet')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
